Thresholding Method for GENIE3 network inference : robustness study
Contexte
On cherche à savoir si notre méthode de seuillage est robuste : on va l'exécuter de nombreuses fois et regarder comment se recoupent les résultats.
Nous exécutons cette méthode en inférant un réseau sur les gènes DE entre cnF et CnF (1309 gènes), et en utilisant les valeurs d'expression des gènes dans les conditions cNF, cnF, CNF, CnF (3 réplicats × 4 conditions, soit 12 colonnes dans la matrice d'expression). On prend une p-value (quantile à 0.001), on ne normalise pas les profils d'expression à une variance unitaire, et on prend 10 variables espionnes tirées dans des lois de Poisson de moyenne la moyenne globale.
# Helper functions for network inference (GENIE3 wrappers, thresholding, ...).
# NOTE(review): directory name "Funtions" looks like a typo, but it must match
# the on-disk path — confirm before renaming.
source("Funtions/Network_functions.R")
# Input data: DE gene lists, regulator (TF) list, normalized expression
# counts, and gene ontology annotations.
load("./Data/DEGsListsFiltered.RData")
load("./Data/PlnTFBDRegulatorsList.RData")
load("./Data/normalized.count_At.RData")
load("./Data/OntologyAllGenes.RData")
# Quantile of the spy-importance null distribution used as the significance
# threshold for edge importances (read as a global by genie3()).
pval <- 0.001
# Target genes: differentially expressed genes between the cnF and CnF
# conditions (1309 genes — see the printed length below).
genes <- DEGs[["cnF CnF"]]
print(length(genes))[1] 1309
# Expression data: keep only the samples whose condition label contains "F"
# (cNF, cnF, CNF, CnF — 3 replicates each, 12 columns total).
normalized.count <- normalized.count[, grepl("F", colnames(normalized.count))]
head(normalized.count) cNF_3 cNF_2 cNF_1 cnF_2 cnF_1 cnF_3
AT1G01010 1404.48413 1137.14606 1370.49722 1234.59407 975.10562 1079.37253
AT1G01020 382.87379 322.15371 354.90475 337.93987 367.05113 392.30632
AT1G01030 28.53146 16.95546 23.33284 28.08096 36.30176 29.68805
AT1G01040 1947.50224 2044.82826 2109.77977 2181.60035 1918.95140 2193.73455
AT1G01050 1826.93381 1765.62838 1746.27871 2355.89598 2214.40739 2297.64271
AT1G01060 85.59438 99.47202 111.75201 130.72173 87.72925 113.45075
CNF_1 CnF_2 CnF_1 CnF_3 CNF_3 CNF_2
AT1G01010 1464.91355 1068.19624 1184.65014 1186.87210 1342.09996 1267.10176
AT1G01020 387.67033 484.08540 459.57834 488.14901 366.77487 433.48218
AT1G01030 20.58427 44.35090 31.12682 40.89662 24.67096 26.39795
AT1G01040 1953.78985 2363.80881 2189.86332 2117.05265 1444.07324 2059.04036
AT1G01050 2173.35535 2176.02519 2019.58131 2223.20984 1898.01881 2045.14670
AT1G01060 192.11981 82.09635 79.64804 98.32591 159.53884 193.12187
Echantillonnage par shuffling des TFs
Les fonctions utilisées :
# Spearman correlation between the expression profiles of two regulators.
#
# pair:   a single string "TF1 TF2" — two gene IDs separated by one space.
# counts: expression data with genes as rows. Defaults to the global
#         `normalized.count`, preserving the original behavior, but can now
#         be passed explicitly.
#
# Returns the Spearman correlation of the two gene rows.
get_cor <- function(pair, counts = normalized.count){
  # Base-R split (gene IDs contain no spaces), replacing the double
  # str_split_fixed() call of the original.
  ids <- strsplit(pair, " ", fixed = TRUE)[[1]]
  cor(counts[ids[1], ], counts[ids[2], ], method = "spearman")
}
# Collapse highly correlated regulators (TFs) into single "meta-regulators".
#
# All regressor pairs with Spearman correlation > corr_thr are linked in an
# undirected graph; Louvain communities of that graph define groups of TFs.
# Each group is replaced by the column-wise mean of its members' profiles,
# stored under a row named "mean_TF1-TF2-...".
#
# normalized.count: expression data, genes as rows, samples as columns.
# regressors:       gene IDs of the candidate regulators.
# corr_thr:         correlation threshold above which two TFs are grouped.
#
# Returns a list:
#   counts:         expression data with grouped TFs replaced by their means
#   new_regressors: meta-regulator names plus the ungrouped regressors
#
# NOTE(review): get_cor() reads the *global* normalized.count, not the
# argument passed here — confirm both refer to the same data when calling.
group_correlated_TFs <- function(normalized.count, regressors, corr_thr = 0.9){
  pairs <- data.frame(t(combn(regressors, 2)))
  pairs$cor <- sapply(paste(pairs[,1], pairs[,2]), get_cor)
  # Exclude NA correlations (constant profiles) explicitly: `NA > corr_thr`
  # would otherwise inject all-NA rows into the subset.
  top <- pairs[!is.na(pairs$cor) & pairs$cor > corr_thr,]
  net_un <- graph_from_data_frame(top, directed = FALSE)
  louvain <- cluster_louvain(net_un)
  groups <- membership(louvain)
  group_ids <- unique(groups)
  # Preallocate instead of growing inside the loop.
  new_reg <- character(length(group_ids))
  grouped_regs <- data.frame(matrix(nrow = length(group_ids),
                                    ncol = ncol(normalized.count)))
  colnames(grouped_regs) <- colnames(normalized.count)
  for(k in seq_along(group_ids)){
    tfs <- names(groups)[groups == group_ids[k]]
    # Meta-regulator profile = mean expression of the group's members.
    grouped_regs[k,] <- colMeans(normalized.count[tfs,])
    new_reg[k] <- paste0("mean_", paste(tfs, collapse = "-"))
  }
  # (The original assigned rownames twice; only the final names are kept.)
  rownames(grouped_regs) <- new_reg
  normalized.count <- rbind.data.frame(normalized.count, grouped_regs)
  # Remove the individual grouped TFs from the data.
  normalized.count <- normalized.count[!rownames(normalized.count) %in% names(groups),]
  list(counts = normalized.count,
       new_regressors = c(new_reg, regressors[!regressors %in% names(groups)]))
}
# Append "spy" (decoy) variables to the expression data.
#
# Each spy row is a random permutation of the expression profile of one
# randomly chosen regressor, giving a null model for regulator importances.
#
# normalized.count: expression data.frame, genes as rows.
# regressors:       candidate regulator IDs to draw donor profiles from.
# n_spies:          number of spy rows to append (named "spy_1", ...).
#
# Returns the data with n_spies extra rows appended.
#
# Fixes vs the original: the unused `TFs` sample is dropped, and the spy
# container is allocated empty instead of sampling n_spies data rows
# without replacement (which errored whenever n_spies > nrow(data)).
insertSpyVariableShuffle <- function(normalized.count, regressors = row.names(normalized.count),
                                     n_spies = 15){
  n_cols <- ncol(normalized.count)
  spies <- data.frame(matrix(nrow = n_spies, ncol = n_cols),
                      row.names = paste0("spy_", seq_len(n_spies)))
  colnames(spies) <- colnames(normalized.count)
  for(spy in seq_len(n_spies)){
    # Shuffle the profile of a randomly picked regressor.
    donor <- sample(regressors, size = 1)
    spies[spy, ] <- sample(unlist(normalized.count[donor, ]), size = n_cols,
                           replace = FALSE)
  }
  rbind.data.frame(normalized.count, spies)
}
# Empirical (right-tailed) p-value of `value` against a null distribution:
# the fraction of null draws strictly greater than the observed value.
getPvalue <- function(value, null_distribution){
  mean(null_distribution > value)
}
# Empirical p-values for a vector of importances against a null
# distribution, corrected for multiple testing (BH/"fdr" by default).
getCorrectedPvalues <- function(null_distribution, importances_to_test, method = 'fdr'){
  raw <- vapply(importances_to_test, getPvalue, numeric(1),
                null_distribution = null_distribution)
  p.adjust(raw, method = method)
}
# Infer a gene regulatory network with GENIE3 and threshold edge importances
# against a null distribution built from "spy" (shuffled decoy) regressors.
#
# normalized.count: expression data (genes as rows, samples as columns).
# regressors:       candidate regulator (TF) gene IDs.
# targets:          target gene IDs whose regulators are inferred.
# nTrees, nCores:   random-forest settings forwarded to GENIE3.
# fdr:              BH-corrected p-value cutoff for keeping links.
# n_spies:          number of spy variables forming the null model.
# batches:          if TRUE, draw fresh spies for every batch of two targets;
#                   if FALSE, insert one set of spies for the whole run.
# group_cor_TFs:    if TRUE, first merge highly correlated TFs into
#                   meta-regulators via group_correlated_TFs().
#
# Returns a directed igraph graph of the links passing the FDR cutoff.
# Side effects: prints diagnostic ggplot densities and run statistics.
#
# NOTE(review): the plotted quantile threshold reads the *global* `pval` —
# confirm it is defined before calling.
genie3 <- function(normalized.count, regressors=NA, targets=NA, nTrees=1000, nCores=5,
fdr = 0.1, n_spies = 15, batches = FALSE, group_cor_TFs = FALSE){
# ____________________________________________________________________________
# batch genie3 ####
# with new spy variables for each run
if(group_cor_TFs){
# Collapse correlated TFs; targets are re-intersected with the remaining
# rows because some targets may have been merged into meta-regulators.
grouping <- group_correlated_TFs(normalized.count, regressors)
normalized.count <- grouping$counts
regressors <- grouping$new_regressors
targets <- intersect(targets, rownames(normalized.count))
}
if(batches){
# One importance matrix for all batches: rows = TFs + spies, cols = targets.
mat <- matrix(nrow = length(regressors) + n_spies, ncol = length(targets))
rownames(mat) <- c(regressors, paste0("spy_",seq(1:n_spies)))
colnames(mat) <- targets
# Even number of targets: process them two at a time, regenerating the
# spy variables for each pair.
if(length(targets) %% 2 == 0){
for(i in seq(1,length(targets), by = 2)){
normalized.count_ <- insertSpyVariableShuffle(normalized.count, regressors,
n_spies = n_spies)
regressors_ = c(regressors, rownames(normalized.count_)[grepl('spy_', rownames(normalized.count_))])
mat[,targets[seq(i,i+1)]] <- GENIE3(as.matrix(normalized.count_), regulators = regressors_,
targets = targets[seq(i,i+1)], treeMethod = "RF", K = "sqrt",
nTrees = nTrees, nCores = nCores, verbose = F)
}
}
# Odd number of targets: pairs as above, except the final batch groups the
# last THREE targets together (the i+2 == length(targets) branch).
if(length(targets) %% 2 == 1){
for(i in seq(1,length(targets)-1, by = 2)){
if(i+2 != length(targets)){
normalized.count_ <- insertSpyVariableShuffle(normalized.count, regressors,
n_spies = n_spies)
regressors_ = c(regressors, rownames(normalized.count_)[grepl('spy_', rownames(normalized.count_))])
# NOTE(review): `a` and `b` look like leftover debugging assignments;
# only the write into `mat` is used afterwards.
b<-mat[,targets[c(i,i+1)]] <- a <- GENIE3(as.matrix(normalized.count_), regulators = regressors_,
targets = targets[seq(i,i+1)], treeMethod = "RF", K = "sqrt",
nTrees = nTrees, nCores = nCores, verbose = F)
}
if(i+2 == length(targets)){
normalized.count_ <- insertSpyVariableShuffle(normalized.count, regressors,
n_spies = n_spies)
regressors_ = c(regressors, rownames(normalized.count_)[grepl('spy_', rownames(normalized.count_))])
mat[,targets[seq(i,i+2)]] <- GENIE3(as.matrix(normalized.count_), regulators = regressors_,
targets = targets[seq(i,i+2)], treeMethod = "RF", K = "sqrt",
nTrees = nTrees, nCores = nCores, verbose = F)
}
}
}
}
# ____________________________________________________________________________
# classic genie3 ####
else{
# Classic run: insert the spies once and run GENIE3 over all targets.
normalized.count <- insertSpyVariableShuffle(normalized.count, regressors,
n_spies = n_spies)
regressors <- intersect(rownames(normalized.count),regressors)
regressors = c(regressors, rownames(normalized.count)[grepl('spy_', rownames(normalized.count))])
mat <- GENIE3(as.matrix(normalized.count), regulators = regressors, targets = targets,
treeMethod = "RF", K = "sqrt", nTrees = nTrees, nCores = nCores, verbose = T)
}
# Flatten importances: spy rows form the empirical null distribution,
# TF rows are the importances to test against it.
spies <- c(mat[grepl('spy_', rownames(mat)),])
tfs <- c(mat[!grepl('spy_', rownames(mat)),])
# pvals <- sapply(tfs, getPvalue, null_distribution=spies)
# fdr_ <- getCorrectedPvalues(spies, tfs, method = "fdr")
#
#
# Diagnostic plot: importance densities for spies vs real TFs, with the
# (1 - pval) quantile of the spy distribution as the dotted threshold.
d <- data.frame("Importance" = c(tfs, spies), Variable = c(rep("TF", length(tfs)), rep("Spy", length(spies))))
p <- ggplot(data = d, aes(x = Importance, fill = Variable)) +
geom_density(alpha = 0.3) + geom_vline(xintercept = quantile(spies, probs = 1 - pval),
linetype="dotted", size=1.5)
print(p + ggtitle("Distribution des importances pour les variables espionnes et les vrais TFs"))
print(p + ylim(c(0,5)) + ggtitle("Zoom sur la partie avec le seuil (quantile à 1/1000)"))
#
# d <- data.frame("p_values" = c(pvals, fdr_), Correction = c(rep("none", length(pvals)), rep("fdr", length(fdr_))))
# p <- ggplot(data = d, aes(x = p_values, fill = Correction)) +
# geom_density(alpha = 0.3)+ geom_vline(xintercept = fdr,
# linetype="dotted", size=1.5)
# print(p + ggtitle("Distribution des p values, avec correction BH (fdr) ou non"))
# Rank all links, drop those whose source is a spy, and attach BH-corrected
# empirical p-values computed against the spy null distribution.
links <- getLinkList(mat)
links <- links[!grepl('spy_', links[,1]),]
links$fdr <- getCorrectedPvalues(null_distribution = spies,
importances_to_test = links$weight,
method = "fdr")
d <- data.frame("p_values" = links$fdr)
p <- ggplot(data = d, aes(x = p_values)) +
geom_density(alpha = 0.3)+ geom_vline(xintercept = fdr, linetype="dotted", size=1.5)
print(p + ggtitle("Distribution des p values, avec correction BH (fdr)"))
print(fdr)
print(sum(links$fdr < fdr))
head(sort(links$fdr))
# Keep only the links below the FDR cutoff and build the directed graph.
links <- links[links$fdr < fdr,]
print(paste0("Number of links : ", dim(links)[1]))
g <- graph_from_data_frame(links, directed = T)
return(g)
}
# Variant of genie3() that does NOT insert new spy variables: despite the
# name, it assumes rows named "spy_*" are ALREADY present in
# normalized.count (the insertion code is kept commented out below). Used
# to measure how much run-to-run variability comes from regenerating spies.
#
# Arguments and return value match genie3(); n_spies and batches are
# accepted for signature compatibility but are unused here.
# NOTE(review): reads the global `pval` for the plotted quantile threshold.
genie3_no_spy <- function(normalized.count, regressors=NA, targets=NA, nTrees=1000, nCores=5,
fdr = 0.1, n_spies = 15, batches = FALSE){
# normalized.count <- insertSpyVariableShuffle(normalized.count, regressors,
# n_spies = n_spies)
#
# regressors <- intersect(rownames(normalized.count),regressors)
# Add the pre-existing spy rows to the regressor set.
regressors = c(regressors, rownames(normalized.count)[grepl('spy_', rownames(normalized.count))])
mat <- GENIE3(as.matrix(normalized.count), regulators = regressors, targets = targets,
treeMethod = "RF", K = "sqrt", nTrees = nTrees, nCores = nCores, verbose = T)
# Spy importances = empirical null; TF importances are tested against it.
spies <- c(mat[grepl('spy_', rownames(mat)),])
tfs <- c(mat[!grepl('spy_', rownames(mat)),])
# pvals <- sapply(tfs, getPvalue, null_distribution=spies)
# fdr_ <- getCorrectedPvalues(spies, tfs, method = "fdr")
#
#
# Diagnostic plot: importance densities for spies vs real TFs, with the
# (1 - pval) quantile of the spy distribution as the dotted threshold.
d <- data.frame("Importance" = c(tfs, spies), Variable = c(rep("TF", length(tfs)), rep("Spy", length(spies))))
p <- ggplot(data = d, aes(x = Importance, fill = Variable)) +
geom_density(alpha = 0.3) + geom_vline(xintercept = quantile(spies, probs = 1 - pval),
linetype="dotted", size=1.5)
print(p + ggtitle("Distribution des importances pour les variables espionnes et les vrais TFs"))
print(p + ylim(c(0,5)) + ggtitle("Zoom sur la partie avec le seuil (quantile à 1/1000)"))
#
# d <- data.frame("p_values" = c(pvals, fdr_), Correction = c(rep("none", length(pvals)), rep("fdr", length(fdr_))))
# p <- ggplot(data = d, aes(x = p_values, fill = Correction)) +
# geom_density(alpha = 0.3)+ geom_vline(xintercept = fdr,
# linetype="dotted", size=1.5)
# print(p + ggtitle("Distribution des p values, avec correction BH (fdr) ou non"))
# Rank all links, drop those whose source is a spy, and attach BH-corrected
# empirical p-values computed against the spy null distribution.
links <- getLinkList(mat)
links <- links[!grepl('spy_', links[,1]),]
links$fdr <- getCorrectedPvalues(null_distribution = spies,
importances_to_test = links$weight,
method = "fdr")
d <- data.frame("p_values" = links$fdr)
p <- ggplot(data = d, aes(x = p_values)) +
geom_density(alpha = 0.3)+ geom_vline(xintercept = fdr, linetype="dotted", size=1.5)
print(p + ggtitle("Distribution des p values, avec correction BH (fdr)"))
print(fdr)
print(sum(links$fdr < fdr))
head(sort(links$fdr))
# Keep only the links below the FDR cutoff and build the directed graph.
links <- links[links$fdr < fdr,]
print(paste0("Number of links : ", dim(links)[1]))
g <- graph_from_data_frame(links, directed = T)
return(g)
# NOTE(review): the text after the closing brace below is a section heading
# ("Iterations de la méthode") fused onto this line by the notebook export.
}Iterations de la méthode
# Run the full inference 30 times to assess the robustness of the spy-based
# thresholding (fresh spies per batch of two targets, correlated TFs grouped).
res <- list()
# normalized.count <- insertSpyVariableShuffle(normalized.count, regressors,
# n_spies = 20)
for (i in seq_len(30)) {
  res[[i]] <- genie3(normalized.count, regressors = regressors, targets = genes,
                     fdr = 0.1, n_spies = 10, group_cor_TFs = TRUE, batches = TRUE)
}
[1] 87
[1] "Number of links : 87"
[1] 0.1
[1] 14
[1] "Number of links : 14"
[1] 0.1
[1] 1385
[1] "Number of links : 1385"
[1] 0.1
[1] 690
[1] "Number of links : 690"
[1] 0.1
[1] 6
[1] "Number of links : 6"
[1] 0.1
[1] 2078
[1] "Number of links : 2078"
[1] 0.1
[1] 5
[1] "Number of links : 5"
[1] 0.1
[1] 833
[1] "Number of links : 833"
[1] 0.1
[1] 2
[1] "Number of links : 2"
[1] 0.1
[1] 19
[1] "Number of links : 19"
[1] 0.1
[1] 772
[1] "Number of links : 772"
[1] 0.1
[1] 170
[1] "Number of links : 170"
[1] 0.1
[1] 6
[1] "Number of links : 6"
[1] 0.1
[1] 1341
[1] "Number of links : 1341"
[1] 0.1
[1] 405
[1] "Number of links : 405"
[1] 0.1
[1] 1042
[1] "Number of links : 1042"
[1] 0.1
[1] 842
[1] "Number of links : 842"
[1] 0.1
[1] 1318
[1] "Number of links : 1318"
[1] 0.1
[1] 1108
[1] "Number of links : 1108"
[1] 0.1
[1] 3
[1] "Number of links : 3"
[1] 0.1
[1] 70
[1] "Number of links : 70"
[1] 0.1
[1] 1
[1] "Number of links : 1"
[1] 0.1
[1] 1051
[1] "Number of links : 1051"
[1] 0.1
[1] 448
[1] "Number of links : 448"
[1] 0.1
[1] 0
[1] "Number of links : 0"
[1] 0.1
[1] 429
[1] "Number of links : 429"
[1] 0.1
[1] 1263
[1] "Number of links : 1263"
[1] 0.1
[1] 1949
[1] "Number of links : 1949"
[1] 0.1
[1] 641
[1] "Number of links : 641"
[1] 0.1
[1] 1418
[1] "Number of links : 1418"
Resultats
# Pairwise overlap between the inferred networks: mat[i, j] is the number of
# edges shared by run i and run j (diagonal = network sizes).
load("D:/These/NetworkInference/networks_from_fdrs_tf_groups_batch.RData")
n_runs <- length(res)
mat <- matrix(nrow = n_runs, ncol = n_runs)
for (i in seq_len(n_runs)) {
  for (j in seq_len(n_runs)) {
    mat[i, j] <- length(igraph::E(igraph::intersection(res[[i]], res[[j]])))
  }
}
# Visualize the overlap matrix as a heatmap, then the distribution of the
# number of edges retained per run.
library(reshape2)
library(ggplot2)
library(viridis)
data <- melt(mat)
ggplot(data, aes(x = Var1, y = Var2, fill = value)) + geom_tile() + scale_fill_viridis() +
  ggtitle("Edges Intersection Length between graphs from different runs")

# Number of edges in each of the inferred networks.
lengths_per_run <- unlist(lapply(res, function(net) {
  length(igraph::E(net))
}))
hist(lengths_per_run, breaks = 30)
Ce n'est vraiment pas robuste… Pas du tout. On va implémenter notre méthode consistant à tirer de nouvelles espionnes pour chaque lot de deux gènes, et aussi essayer de regrouper les TFs très corrélés en un seul gène.
Même avec notre méthode qui génère des nouvelles spies pour chaque lot de 2 gènes, on n’a pas atteint vraiment de robustesse.
On pourrait : essayer en prenant plus d'arbres, ou en ne prenant pas juste la racine carrée du nombre de variables à chaque split.
Je vais aussi essayer de lancer avec les mêmes espionnes sur 30 runs, pour voir quelle est la part de la génération des espionnes dans la variabilité du résultat. Si ça se trouve, ce n'est pas la faute des espionnes.